Setup

library(tidyverse)
library(ape)
library(phangorn)
library(pegas)
library(strataG)
library(knitr)
library(emo)
library(coda)
library(ggmcmc)
library(perm)

# Function library
harvest.model.likelihoods <- function(workingDir = workingDir,
                                      outfileName = "outfile.txt",
                                      multilocus = T){
    # this function harvests model marginal likelihoods for models calculated by
    # the program migrate-n (Beerli & Felsenstein 2001).
    # It takes as input a directory full of directories, 
    # each of which contains output from a migrate model, and is named
    # after that model. 
  
    #initialize a data frame to take the values
    modelMarglikes <- data.frame(model=character(),
                             thermodynamic=numeric(),
                             bezier.corrected=numeric(), 
                             harmonic=numeric()) 
    # loop through directories in the working directory, each of which is name
    # after a different model
  for(i in list.dirs(workingDir, full.names = F)[-1]){ #i<-"stepping.stone"
      modelDir<-file.path(workingDir,i)
      print(modelDir)
    #scan in the outfile, separating at each newline
      outfile<-scan(file=file.path(modelDir,outfileName),what="character",sep="\n") 
    #find the line with the likelihoods on it and split on runs of spaces
      marglikeline <- strsplit(grep("  All          ",outfile,value=T),
                               "\\s+", perl = T)[[1]][3:5]
    #  if(length(marglikeline)==0){next}
      marglikes <- c(i,marglikeline)
     
      modelMarglikes <- rbind(modelMarglikes,marglikes, deparse.level = 2)
  }
  names(modelMarglikes) <- c("model","thermodynamic","bezier.corrected","harmonic")
  modelMarglikes[2:4] <- sapply(modelMarglikes[2:4], as.numeric)
  return(modelMarglikes)
}

bfcalcs<-function(df,ml="bezier.corrected"){
  # This calculates log bayes factors on data frames output by
  # harvest.model.likelihoods(), following Johnson and Omland (2004)
  # You may choose the likelihood flavor with
  # ml = "bezier.corrected", "thermodynamic" or "harmonic"
  #df$thermodynamic <- as.numeric(df$thermodynamic)
  #df$bezier.corrected <- as.numeric(df$bezier.corrected)
  #df$harmonic <- as.numeric(df$harmonic)
  mlcol <- df[[ml]] 
    bmvalue <- mlcol[which.max(mlcol)]
    lbf <- 2*(mlcol-bmvalue)
    choice <- rank(-mlcol)
    modelprob <- exp(lbf/2)/sum(exp(lbf/2))
    dfall <- cbind(df,lbf,choice,modelprob)
    return(dfall)
}   

migrants.per.gen<-function(x){
  #a function for creating Nm vectors out of m and Theta vectors.
  #x<-x[[1]]
  m<-names(x)[which(grepl("M_",names(x)))] #names of m columns
  #theta<-names(x)[which(grepl("Theta_",names(x)))] #names of theta columns
  for(n in m){
    t<-paste("Theta",strsplit(n,split="_")[[1]][3],sep="_")
    x[,paste("Nm",strsplit(n,split="_")[[1]][2],strsplit(n,split="_")[[1]][3],sep="_")]<- x[,which(names(x)==n)]*x[,which(names(x)==t)] 
    #this hairy little statement makes a new column named "Nm_X_Y" and then fills it by multiplying the M_X_Y column by the Theta_Y column  
  }
  return(x)
}

Introduction

Rob Toonen asked me to take a look at this dataset sequenced and analyzed by ’Ale’alani Dudoit.

From Rob:

It is a zoanthid - Palythoa tuberculosa - that is most common in anthropogenically disturbed habitats. It seems like it could be a cool story, and she has plenty of SNPs or contigs to do whatever analysis you think would be most interesting to work up.

’Ale’a:

Mahalo for checking out our data Eric, would be great to try out Migrate-n and see what results come of it. A couple other folks in the lab had similar interesting fst/structure results with their data (corals and fish) where O’ahu is more genetically similar to Kure/Midway atoll than to neighboring islands. Would be interesting to see if we get any cool results as Rob mentioned with military transport b/w O’ahu and Midway during WWII. I can send along my TotalRawSNPs and Filtered vcf files through scp command if you have a destination path you can provide me. Please let me know what other info you need from me, happy to send along.

Eric:

So it turns out that dDocent (or actually an associated perl script) can make haplotypes, but it will need the bam files for each individual. Supposedly we have unlimited storage on Google Drive at PSU, so I’m going to test that. I’ve shared a folder that you can use to upload the bam files.

Transfer files

So, now I’ve received all .bam and .bai files for each individual, as well as locality metadata and total and filtered SNP datasets from ’Ale’a on Google Drive. I compressed these on my mac, and now need to transfer them to Argonaute. I will do this with the gdown python package (pip install gdown). This involves looking up the google id for each file in the URL of the share link and pasting that into gdown thusly

gdown https://drive.google.com/uc?id=19nagmzHPQgKE4PPlyHikcWc7lBZFMbXB --output ptuberculosa_metadata.csv
gdown https://drive.google.com/uc?id=19g9fMRdPH-_AG-l6kytu-lv6kd3qYpQD --output popmap
gdown https://drive.google.com/uc?id=1ASB4yOnwDzDhFPA7Pev4CyIl6M6SX8_P --output popmap.csv
gdown https://drive.google.com/uc?id=19ZsobdvBIs3bQukW0Q-ivpdlJEdGguns --output reference.fasta
gdown https://drive.google.com/uc?id=19XNrJvZKMQ3L7qTIuExajc5KY6-gj3P8 --output mm0.9minq30mac3thin500.recode.vcf
gdown https://drive.google.com/uc?id=19vth-3IMkfMfxq4cp0QLOUgC2yFqnZ3C --output TotalRawSNPs.vcf
gdown https://drive.google.com/uc?id=1I0ib-QX4eyedctD951os6gvAMUe761JU --output bam_files.zip


gdown https://drive.google.com/uc?id=1ES3yMzZS8JkWROiE1jdfTp6KBjxDdCXO --folder --remaining-ok --output firstbatch

Haplotypes from dDocent

Filter SNPs

‘Ale’ already filtered the SNPs quite a bit, but I need to start that process over because coalescent programs are sensitive to ascertainment bias. Also, need to knit them back into haplotypes. As Peter Beerli says on page 15 of the migrate-n manual:

We use a rather restrictive ascertainment models for SNPs ?. A better approach than using SNPs is the use of short reads which may or many not contain SNPs. I find that SNPs are an inferior datatype because commonly researchers are adding criteria such as a minor SNP allele must occur at a frequency higher than x, and singletons are excluded etc.

We have found ALL variable sites and use them even if there are only a few members of another alleles present. In principal it is as you would sequence a stretch of DNA and then remove the invariant sites. Each stretch is treated as completely linked. You can combine many of such “loci” to improve your estimates.

I’ve installed dDocent into a dedicated environment following instructions here.

conda activate ddocent_env

Following the tutorial, but altering filtering parameters. Keeping all SNPs with even a single occurence (Minor Allele Count = 1), but they must occur in 90% of individuals (mac 1) and be of high quality (minQ 30). We are keeping genotypes with as few as 3 reads. This is because evidence to keep these SNPs is based on their existence in multiple individuals.

vcftools --vcf ../TotalRawSNPs.vcf --max-missing 0.1 --mac 1 --minQ 30 --minDP 3 --recode --recode-INFO-all --out rawg.1mac1dp3

After filtering, kept 93 out of 93 Individuals Outputting VCF file… After filtering, kept 28128 out of a possible 52804 Sites Run Time = 9.00 seconds

Run this error checking script to see the potential impact of shallow-sequenced genotypes

ErrorCount.sh rawg.1mac1dp3.recode.vcf

<!-- This script counts the number of potential genotyping errors due to low read depth -->
<!-- It report a low range, based on a 50% binomial probability of observing the second allele in a heterozygote and a high range based on a 25% probability. -->
<!-- Potential genotyping errors from genotypes from only 1 read range from 0.0 to 0.0 -->
<!-- Potential genotyping errors from genotypes from only 2 reads range from 0.0 to 0.0 -->
<!-- Potential genotyping errors from genotypes from only 3 reads range from 3808.875 to 12797.82 -->
<!-- Potential genotyping errors from genotypes from only 4 reads range from 3860.25 to 19517.424 -->
<!-- Potential genotyping errors from genotypes from only 5 reads range from 913.78125 to 6930 -->
<!-- 93 number of individuals and 28128 equals 2615904 total genotypes -->
<!-- Total genotypes not counting missing data 2508832 -->
<!-- Total potential error rate is between 0.0034210765208670807 and 0.015642834593946504 -->
<!-- SCORCHED EARTH SCENARIO -->
<!-- WHAT IF ALL LOW DEPTH HOMOZYGOTE GENOTYPES ARE ERRORS????? -->
<!-- The total SCORCHED EARTH error rate is 0.04841934414101861. -->

Look at missing data per individual

vcftools --vcf rawg.1mac1dp3.recode.vcf --missing-indv

mawk '!/IN/' out.imiss | cut -f5 > totalmissing
gnuplot << \EOF 
set terminal dumb size 120, 30
set autoscale 
unset label
set title "Histogram of % missing data per individual"
set ylabel "Number of Occurrences"
set xlabel "% of missing data"
#set yr [0:100000]
binwidth=0.01
bin(x,width)=width*floor(x/width) + binwidth/2.0
plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
                                                                                                                        
                                         Histogram of % missing data per individual                                     
     20 +-----------------------------------------------------------------------------------------------------------+   
        |    * *      +            +             +            +             +            +             +            |   
        |    * **                                             'totalmissing' using (bin($1,binwidth)):(1.0) ******* |   
        |    * **                                                                                                   |   
        |    * **                                                                                                   |   
        |    * **                                                                                                   |   
     15 |-+  * **                                                                                                 +-|   
        |    * **                                                                                                   |   
        |   ** **                                                                                                   |   
        |   ** **                                                                                                   |   
        |   ** **                                                                                                   |   
     10 |-+ ** **                                                                                                 +-|   
        |   ** **                                                                                                   |   
        |   ** ***                                                                                                  |   
        |   ** ***                                                                                                  |   
        |   ** ***                                                                                                  |   
        |   ** *****                                                                                                |   
      5 |-+ ** *** *                                                                                              +-|   
        |  *** *** ****                                                                                             |   
        |  *** *** ** *                                                                                             |   
        |  *** *** ** *                                                                                             |   
        |  *** *** ** *******                                                                                       |   
        |***** *** ** *** * *******************************************************************************         |   
      0 +-----------------------------------------------------------------------------------------------------------+   
        0            0.1          0.2           0.3          0.4           0.5          0.6           0.7          0.8  
                                                      % of missing data                                                 

‘Ale’ has probably removed any really bad individuals already. I’ll keep all of these for now.

Now filter based on missingness within populations. Puritz wrote a script for this that creates lots of output, so I created ./filterpop directory to run it in. I am removing loci that are missing 10% of data in any of 10 populations. I altered popmap to remove spaces from pop names too.

pop_missing_filter.sh ../rawg.1mac1dp3.recode.vcf ../../popmap 0.1 10 rawg1mac1dp3allpops.1.vcf

This filter keeps loci with Allele Balance between 0.25 and 0.75. From Jon:

Allele balance is: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous Because RADseq targets specific locations of the genome, we expect that the allele balance in our data (for real loci) should be close to 0.5

So this will keep only really high quality SNPs

 vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" ./filterpops/rawg1mac1dp3allpops.1.vcf.recode.vcf > rawg.1mac1dp3allpop.1.ABfil.vcf
 
 mawk '!/#/' *ABfil.vcf | wc -l
2756

Now remove SNPs that occur on both strands of a read. Jon:

Unless you are using super small genomic fragment or really long reads (MiSeq). A SNP should be covered only by forward or only reverse reads. The filter is based on proportions, so that a few extraneous reads won’t remove an entire locus…. In plain english, it’s keeping loci that have over 100 times more forward alternate reads than reverse alternate reads and 100 times more forward reference reads than reverse reference reads along with the reciprocal….That only removes a small proportion of loci, but these loci are likely to be paralogs, microbe contamination, or weird PCR chimeras.

```{basjh. eval = F} vcffilter -f “SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100” -s rawg.1mac1dp3allpop.1.ABfil.vcf > rawg.1mac1dp3allpop.1.ABfil.FRfil.vcf

mawk ‘!/#/’ rawg.1mac1dp3allpop.1.ABfil.FRfil.vcf | wc -l 7


Hmmm... that's a bit too stringent... maybe these were done on a MiSeq?


> The next filter looks at the ratio of mapping qualities between reference and alternate alleles...The rationale here is that, again, because RADseq loci and alleles all should start from the same genomic location there should not be large discrepancy between the mapping qualities of two alleles.


```bash
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" rawg.1mac1dp3allpop.1.ABfil.vcf > rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf
mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf | wc -l
2512

Yet another filter that can be applied is whether or not their is a discrepancy in the properly paired status of for reads supporting reference or alternate alleles….Since de novo assembly is not perfect, some loci will only have unpaired reads mapping to them. This is not a problem. The problem occurs when all the reads supporting the reference allele are paired but not supporting the alternate allele. That is indicative of a problem.

vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s rawg.1mac1dp3allpop.1.ABfil.MQfil.vcf > rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf

mawk '!/#/' rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf| wc -l
2141
dDocent_filters rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.vcf rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf 
This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,
quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation.
The script assumes that loci and individuals with low call rates (or depth) have already been removed. 

Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters 

Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth
 1192 of 2141 

Are reads expected to overlap?  In other words, is fragment size less than 2X the read length?  Enter yes or no.
yes
Is this from a mixture of SE and PE libraries? Enter yes or no.
no
Number of additional sites filtered based on properly paired status
 0 of 949 

Number of sites filtered based on high depth and lower than 2*DEPTH quality score
 78 of 949 

                                                                                                                        
                                                                                                                        
                                               Histogram of mean depth per site                                         
       30 +---------------------------------------------------------------------------------------------------------+   
          |    +    +   **+    +    +    +     +    +    +    +     +    +    +    +     +    +    +    +     +    +|   
          |             **                                'meandepthpersite' using (bin($1,binwidth)):(1.0) ******* |   
          |         *** **                                                                                          |   
       25 |-+       * * **                                                                                        +-|   
          |        ** ****    **                                                                                    |   
          |        ** ****    **                                                                                    |   
          |        ** ****    **                                                                                    |   
       20 |-+   ** ** ****    **                                                                                  +-|   
          | **  ** ** ****    **                                                                                    |   
          | **  ** ** *****   ** **                                                                                 |   
       15 |***  ** ** *****   ** **                                                                               +-|   
          |**** ** ** *****   ** **                                                                                 |   
          |**** ** ** ******* ** **                                                                                 |   
          |**** ***** ******* ** **              **                                                                 |   
       10 |********** *************   ***        **                                                               +-|   
          |********** ************** ****     ** **                                                                 |   
          |********** ************** ****     ** **                                                                 |   
          |********** ********************** *** ***          *****                                                 |   
        5 |********** ******************** * ************     * ***                                               +-|   
          |********** ******************** **************  ** * *******  **                                         |   
          |********** ******************** ******************** ****************** ************ *** ***  ***       *|   
          |********** ******************** ******************** ************ *** *** ****** * *** *** **** *********|   
        0 +---------------------------------------------------------------------------------------------------------+   
          10   15   20    25   30   35   40    45   50   55   60    65   70   75   80    85   90   95  100   105  110   
                                                          Mean Depth                                                    
                                                                                                                        
If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be 10141.7555
The 95% cutoff would be 101
Would you like to use a different maximum mean depth cutoff than 101, yes or no
no
Number of sites filtered based on maximum mean depth
 81 of 949 

Number of sites filtered based on within locus depth mismatch
 36 of 868 

Total number of sites filtered
 1309 of 2141 

Remaining sites
 832 

Filtered VCF file is called rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf.FIL.recode.vcf

Filter stats stored in rawg.1mac1dp3allpop.1.ABfil.MQfil.MapFil.dDFil.vcf.filterstats
(ddocent_env) ecrandall@Argonaute:~/eric_data/ptuberculosa/filtering$ 

This script apparently does many of the other steps that I did above manually, so I started from an earlier checkpoint

dDocent_filters  rawg1mac1dp3allpops.1.vcf.recode.vcf  rawg.1mac1dp3allpop.1.dDfil.vcf 
This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,
quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation.
The script assumes that loci and individuals with low call rates (or depth) have already been removed. 

Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters 

Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth
 23877 of 25261 

Are reads expected to overlap?  In other words, is fragment size less than 2X the read length?  Enter yes or no.
yes
Is this from a mixture of SE and PE libraries? Enter yes or no.
no
Number of additional sites filtered based on properly paired status
 197 of 1384 

Number of sites filtered based on high depth and lower than 2*DEPTH quality score
 1673 of 1187 

                                                                                                                        
                                                                                                                        
                                               Histogram of mean depth per site                                         
       60 +---------------------------------------------------------------------------------------------------------+   
          | +    +     +    +     +    +    +     +    +     +    +     +    +    +     +    +     +    +     +    +|   
          |                                               'meandepthpersite' using (bin($1,binwidth)):(1.0) ******* |   
          |           **                                                                                            |   
       50 |-+         **                                                                                          +-|   
          |           **                                                                                            |   
          |        ** **                                                                                            |   
          |        ** **   **                                                                                       |   
       40 |-+      ** **   **                                                                                     +-|   
          |    **  ** **   **                                                                                       |   
          |    ** *** ***  ** **                                                                                    |   
       30 |-+  *********** *****                                                                                  +-|   
          | ** *********** *****                                                                                    |   
          | ** *****************                                                                                    |   
          | ********************                                                                                    |   
       20 |-********************  **                                                                              +-|   
          |********************** ***                                                                               |   
          |********************** ***                                                                               |   
          |****************************     *                                                                       |   
       10 |**************************** *** *                                                                     +-|   
          |******************************** **   **    ***** **                                                     |   
          |**************************************************** **       **         ***                             |   
          |************************************************************************** ****  **  ** +  *** **  +*****|   
        0 +---------------------------------------------------------------------------------------------------------+   
            12   18    24   30    36   42   48    54   60    66   72    78   84   90    96  102   108  114   120  126   
                                                          Mean Depth                                                    
                                                                                                                        
If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be 11589.30825
The 95% cutoff would be 116
Would you like to use a different maximum mean depth cutoff than 116, yes or no
no
Number of sites filtered based on maximum mean depth
 109 of 1187 

Number of sites filtered based on within locus depth mismatch
 21 of 1072 

Total number of sites filtered
 24210 of 25261 

Remaining sites
 1051 

Filtered VCF file is called rawg.1mac1dp3allpop.1.dDfil.vcf.FIL.recode.vcf

Filter stats stored in rawg.1mac1dp3allpop.1.dDfil.vcf.filterstats

mv rawg.1mac1dp3allpop.1.dDfil.vcf.FIL.recode.vcf rawg.1mac1dp3allpop.1.dDfil.vcf

Now to filter by HWE

The next filter to apply is HWE. Heng Li also found that HWE is another excellent filter to remove erroneous variant calls. We don’t want to apply it across the board, since population structure will create departures from HWE as well. We need to apply this by population. I’ve included a perl script written by Chris Hollenbeck, one of the PhD student’s in my current lab that will do this for us.

Let’s filter our SNPs by population specific HWE First, we need to convert our variant calls to SNPs To do this we will use another command from vcflib called vcfallelicprimatives

vcfallelicprimitives rawg.1mac1dp3allpop.1.dDfil.vcf --keep-info --keep-geno > rawg.1mac1dp3allpop.1.dDfil.prim.vcf

This will decompose complex variant calls into phased SNP and INDEL genotypes and keep the INFO flags for loci and genotypes. Next, we can feed this VCF file into VCFtools to remove indels.

vcftools --vcf rawg.1mac1dp3allpop.1.dDfil.prim.vcf --remove-indels --recode --recode-INFO-all --out SNP.rawg.1mac1dp3allpop.1.dDfil

Outputting VCF file… After filtering, kept 1096 out of a possible 1126 Sites Run Time = 0.00 seconds

Now apply the HWE filter, in filterhwe folder

filter_hwe_by_pop.pl -v ../SNP.rawg.1mac1dp3allpop.1.dDfil.recode.vcf -p ../../popmap -o SNP.rawg.1mac1dp3allpop.1.dDfil.HWE -h 0.01 -c 0.1

<!-- Processing population: BigIsland (17 inds) -->
<!-- Processing population: FrenchFrigateShoals (9 inds) -->
<!-- Processing population: Kauai (15 inds) -->
<!-- Processing population: Kure (10 inds) -->
<!-- Processing population: MaroReef (5 inds) -->
<!-- Processing population: Maui (10 inds) -->
<!-- Processing population: Molokai (10 inds) -->
<!-- Processing population: Oahu (10 inds) -->
<!-- Processing population: Pearl_Hermes (4 inds) -->
<!-- Processing population: PioneerBanks (3 inds) -->
<!-- Outputting results of HWE test for filtered loci to 'filtered.hwe' -->
<!-- Kept 997 of a possible 1096 loci (filtered 99 loci) -->

Hmm… the HWE filter is not working great, likely because of low sample sizes. It’s leaving me with mostly loci without individuals with alternate homozygotes

conda deactivate

Haplotyping

rad_haplotyper.pl

Chris Hollenbeck has created a script that can get haplotypes from dDocent output. First, gotta remove the -RG from the .bam and .bai files.

for f in *.bam; do mv "$f" "${f%-RG.bam}.bam" ; done
for f in *.bam.bai; do  mv "$f" "${f%-RG.bam.bai}.bam.bai" ; done

Now, to install the rad_haplotyper environment following these instructions

hmmm … rad_haplotyper isn’t working because the Vcf Perl module changed its name to VCF… breaking the script. I tried a few minor modifications but couldn’t fix it. Created an issue in the github site.

Remember to remove the environment: $ conda remove rad_haplotyper_env

Microhaplot

I am now going to try microhaplot by Thomas Ng and Eric Anderson!!

Now, have to convert the bam files to sam files.

for bam in *.bam                                                                                                      
do
echo $bam
samtools view -h -o ../sam_files/${bam%.bam}.sam $bam
done

for bam in *.bam                                                                                                      
do
echo $bam
mv $bam ${bam%-RG.bam}.bam
done

for bam in *.bai                                                                                                      
do
echo $bam
mv $bam ${bam%-RG.bam.bai}.bam.bai
done

for bam in *.bam                                                                                                      
do
echo $bam
samtools idxstats $bam > ./idxstats/${bam%.bam}_idxstats.txt
done


for bam in *.bam                                                                                                      
do
echo $bam
samtools view -h -o /Users/edc5240/Datasets/Ptuberculosa/sam_files/${bam%.bam}.sam $bam
done

Tutorial

Do the tutorial

library(microhaplot)
shiny_dir <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/Ptuberculosa_data/shiny_dir"
microhaplot::mvShinyHaplot(shiny_dir)

app.path <- file.path(shiny_dir, "microhaplot")
microhaplot::runShinyHaplot(app.path)

Load Data

Then follow along here to get data into microhaplot.

library(microhaplot)

run.label <- "Ptuberculosa"

path <- "/Users/edc5240/Datasets/Ptuberculosa"
sam.path <- "/Users/edc5240/Datasets/Ptuberculosa/sam_files"
# untar(system.file("extdata",
#                   "sebastes_sam.tar.gz",
#                   package="microhaplot"),
#       exdir = sam.path)


label.path <- file.path(path, "labels.txt")
vcf.path <- file.path(path, "mm0.9minq30mac3thin500.recode.vcf")

mvShinyHaplot(file.path(path,"shiny_dir"))
app.path <- file.path(path,"shiny_dir", "microhaplot")

haplo.read.tbl <- prepHaplotFiles(run.label = run.label,
                            sam.path = sam.path,
                            out.path = path,
                            label.path = label.path,
                            vcf.path = vcf.path,
                            app.path = app.path)

runShinyHaplot(path = "/Users/edc5240/Datasets/Ptuberculosa/shiny_dir/microhaplot")

Working with the data in microhaplot was OK, but I kept getting unexplained N’s in the output, and it was going to take some coding effort to translate the output into migrate format. Moreover, microhaplot only kept the variable sites, which isn’t ideal for migrate’s mutation model. So I am giving up on this front, and am going to try to generate haplotypes with Stacks.

Haplotypes from Stacks

Setup

Need first to copy all the fastq files over from Google Drive, where ’Ale’a put them. I split them into 4 tranches to keep the number of files under 50 as required by gdown. Note that the folder protocol won’t take urls, but just the ID.

gdown --id 1ES3yMzZS8JkWROiE1jdfTp6KBjxDdCXO --folder --remaining-ok --output firstbatch
gdown --id 1Ofta4RLcf6vddIiAA6dLCkdrYngADW1d --folder --remaining-ok --output secondbatch
gdown --id 1Og6JQNlhgprYvLTxJtMBlCCj72qRxLOw --folder --remaining-ok --output thirdbatch
gdown --id 1Ojnlj3L_8UFoPUVxaqvLKTLs8qq_S9ml --folder --remaining-ok --output fourthbatch

Need to rename the files into Illumina format in order for Stacks to recognize the paired reads

cd raw

for f in *.F.fq.gz
do
mv $f ${f/_1.F.fq.gz/_R1_001.fastq.gz}
done

for f in *.R.fq.gz
do
mv $f ${f/_1.R.fq.gz/_R2_001.fastq.gz}
done

cd ..

Process RadTags

’Ale’a said:

It was an ezRAD protocol developed by the Tobo lab. The restriction enzymes used were both MboI and Sau3AI. The sequences have indexes removed but still need to be trimmed/ends cleaned.

So I will process the radtags to remove adapters and the sau3AI cut site (GATC, which is the same as MboI). This command cleans data removing reads with uncalled bases, low phred scores, rescue barcodes and rad-tags.


process_radtags -p ./raw/ -o ./cleaned/ --rescue --clean --quality --paired --filter_illumina --renz-1 sau3AI \
--adapter-1 AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
--adapter-2 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT

724964572 total sequences 0 failed Illumina filtered reads (0.0%) 227835784 reads contained adapter sequence (31.4%) 0 barcode not found drops (0.0%) 28941251 low quality read drops (4.0%) 63374437 RAD cutsite not found drops (8.7%) 404813100 retained reads (55.8%)

Pipeline

Hmm… gotta rename the cleaned files too

cd cleaned
for f in *001.1.fq.gz
do
mv $f ${f/_R[12]_001.1.fq.gz/.1.fq.gz}
done

for f in *001.2.fq.gz
do
mv $f ${f/_R[12]_001.2.fq.gz/.2.fq.gz}
done

I’m going to use the denovo_map.pl pipeline. This command will take data from ./cleaned and the popmap I created for dDocent and -m 3 reads per stack, -n 4 distance between stacks, -M 4 distance between catalog loci. Running on 12 -T threads and only keeping loci that appear in 75% of individuals in all 10 populations

denovo_map.pl --samples ./cleaned/ --popmap ../popmap --out-path ./pipeline --paired \
-m 3 -n 4 -M 4 -T 12 -r 75 -p 10 -X "populations: --fasta-samples" -X "populations: --filter-haplotype-wise"

Copy it down

scp -r  ecrandall@128.118.123.64:eric_data/ptuberculosa/stacks/pipeline/output1 ./stacks_output              

Populations

Now I need to re-run populations to get more statistics. Original populations output is in output1, this will be in output2.

populations -P ../ -O ./ -M ../../../popmap -t 12 -p 10 -r 75 -H --hwe --fstats --p-value-cutoff 0.99 --fasta-loci --fasta-samples --vcf --genepop --structure --treemix --fasta-samples-raw 

Fasta2Genotype

Paul Maier created this script to convert Stacks haplotypes to migrate format. I had to remove all the [individualName] tokens from the populations.samples.fa to get it to work.

python2 /Applications/migrate/migrate-4.4.4/fasta2genotype/fasta2genotype.py populations.samples2.fa NA ../popmap.tsv  populations.snps.vcf ptuberculosa.mig
###################################################################
###                                                             ###
###       Fasta2Genotype | Data Conversion | Version 1.10       ###
###                                                             ###
###                        Cite as follows:                     ###
###                                                             ###
###   Maier P.A., Vandergast A.G., Ostoja S.M., Aguilar A.,     ###
###   Bohonak A.J. (2019). Pleistocene glacial cycles drove     ###
###   lineage diversification and fusion in the Yosemite toad   ###
###   (Anaxyrus canorus). Evolution, in press.                  ###
###   https://www.doi.org/10.1111/evo.13868                     ###
###                                                             ###
###################################################################

Output type? [1] Migrate [2] Arlequin [3] DIYABC [4] LFMM [5] Phylip [6] G-Phocs [7] Treemix [8] Haplotype: 1
Remove restriction enzyme or adapter sequences? These may bias data. [1] Yes [2] No: 2
Coverage Cutoff (number reads for locus)? Use '0' to ignore coverage: 0
Remove monomorphic loci? [1] Yes [2] No: 2
Remove loci with excess heterozygosity? This can remove paralogs. [1] Yes [2] No: 1
Maximum heterozygosity cutoff for removing loci out of Hardy-Weinberg? 0.5
Filter for allele frequency? False alleles might bias data. [1] Yes [2] No: 2
Filter for missing genotypes? These might bias data. [1] Yes [2] No: 2
 
**************************************************************************************************************
***                                       ... BEGINNING CONVERSION ...                                     ***
**************************************************************************************************************
 
Cataloging loci...
Counting locus lengths...
Cataloging populations...
Counting gene copies...
Counting alleles for each locus...
Identifying loci with excess heterozygosity...
     Calculating observed heterozygosity and homozygosity...
     Calculating expected heterozygosity and homozygosity...
     Flagging loci with excess heterozygosity for removal...
     Removing loci...
     Removed 2 overly heterozygous loci.
Outputting migrate-n file...
*** DONE! ***

Then I manually re-ordered the populations to run from north to south in the output file, and moved it into the migrate folder.

Migrate

Install Migrate

Install the parallel version of Migrate on Nautilus server. I hade to remove -fvectorize from line 100 of the makefile to get this to work.

curl https://peterbeerli.com/migrate-html5/download_version4/migrate-newest-src.tar.gz > migrate.tar.gz
tar -xvf migrate.tar.gz
cd migrate-4.4.4/
cd src
./configure
make mpis (for parallel running)
sudo cp migrate-n-mpi /usr/local/bin/migrate-n-mpi

Locus Statistics & Mutation Model

These RAD loci are much less variable than COI that I’m used to using. I need to figure out an overall mutation model to use with RAD loci. I can’t find any discussion of this issue on the Migrate Google Group, so I created one. I guess I’ll use modelTest in the phangorn package to see where that gets me.

I renamed the FASTA headers in populations.samples.fa with BBEdit:

Find: >CLocus_\d+_Sample_\d+_Locus_(\d+)_Allele_([01]) \[(.+)\]
Replace: >\3_L\1_A\2

Lets load in the data and calculate some statistics for each locus. Previously Migrate-n only implemented the F84 (=HKY) model, with two rates (Transistions and Transversions) and gamma distributed rate variability. The new v4 has a datamodel parameter that suggests it might take other models of molecular evolution, but the possible models are not listed in the documentation! So I will just fit an HKY model.

note the interface lists these possible models: 1 Jukes-Cantor model 2 Kimura 2-parameter model 3 Felsenstein 1981 (F81) model 4 Felsenstein 1984 (F84) model 5 Hasegawa-Kishino-Yano model 6 Tamura-Nei model

fastadata <- read.FASTA("/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/allpops75_hwe/populations.samples_rename.fasta")

# have to read in the locus lengths, because this is the only way to make sure the stats
# and the migrate infile are ordered the same
migrate_lengths <- read_lines("/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/ptuberculosa.mig", skip = 1, n_max=1) %>% 
  str_split("\\t", simplify = T) %>%  t(.) %>% tibble(length=.)

# Get locus names. fasta2genotype orders them alphabetically, so do this here too
locus_names <- str_extract(names(fastadata),"L\\d+") %>% unique() %>% sort()

stats <- tibble(locus = character(), 
                length = numeric(),
                segSites = numeric(),
                nHaps = numeric(),
                nucDiv = numeric(),
                ttRatio = numeric(),
                model = character(),
                gammaShape = numeric(),
                rate1 = numeric(), 
                rate2 = numeric(), 
                rate3 = numeric(),
                rate4 = numeric(),
                Q1= numeric(),
                Q2 = numeric(),
                Q3 = numeric(),
                )
                
for(l in locus_names){
  print(paste("Now Doing Locus", l, match(l, locus_names), "out of", length(locus_names)))
  locus_dnabin <- fastadata[str_which(names(fastadata),pattern = l)]
  # convert to package formats
  locus_dnabin <- as.matrix(locus_dnabin)
  locus_gtypes <- sequence2gtypes(locus_dnabin)
  locus_phy <- phyDat(locus_dnabin)
  #create a haplotype network .. to be continued
  haps <- haplotype(locus_dnabin)
  nhaps <- length(dimnames(haps)[[1]])
  #tcs <- haploNet(haps)
  #find parameters of HKY (F84) model
  modeltest <- modelTest(locus_phy, model = c("JC","K80", "F81", "HKY","TrN"), 
                         G = T, I = F, k = 4, mc.cores = 4)
  # pick out the best model. If multiple models are tied for best, pick the simplest one
  bestmodel <- modeltest$Model[which(modeltest$BIC == min(modeltest$BIC))][1]
  #open the object environment
  env <- attr(modeltest,"env")
  bestparams <- get(bestmodel, env)
  bestparams <- eval(bestparams, env=env)
  # use this code for v3, which only has F84 (HKY)
  #HKY <- modelTest(locus_phy, model = c("HKY"), 
  #                       G = T, I = F, k = 4)
  #env <- attr(HKY, "env")
  #HKYG <- get("HKY+G", env)
  #model <- eval(HKYG, env=env)
  # calculate TiTv Ratio
  ttratio <- TiTvRatio(locus_gtypes)
  
  stats <- bind_rows(stats, tibble(locus=l, 
                          length = length(locus_dnabin[1,]),
                          segSites = length(seg.sites(locus_dnabin)),
                          nHaps = length(dimnames(haps)[[1]]),
                          nucDiv = nuc.div(locus_dnabin),
                          ttRatio =  ttratio[3],
                          model = bestmodel,
                          gammaShape = bestparams$shape,
                          rate1 = bestparams$g[1],
                          rate2 = bestparams$g[2],
                          rate3 = bestparams$g[3],
                          rate4 = bestparams$g[4],
                          Q1 = sort(unique(bestparams$Q))[1],
                          Q2 = sort(unique(bestparams$Q))[2],
                          Q3 = sort(unique(bestparams$Q))[3]
                          ))
                         
                        
}

stats <- stats[which(stats$length %in% migrate_lengths$length),]
#setdiff(stats$length, as.numeric(migrate_lengths$length))
# write_csv(stats, "./migrate/run4_locus_statistics.csv")

Write a model block

Write a space delimited textfile that can be added to the migrate data file in the format that Peter suggested.

stats <- read_csv("./migrate/run4_locus_statistics.csv")
## Rows: 109 Columns: 15── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, model
## dbl (9): length, segSites, nHaps, nucDiv, ttRatio, gammaShape, rate1, Q1, Q2
## lgl (4): rate2, rate3, rate4, Q3
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(stats)
locus length segSites nHaps nucDiv ttRatio model gammaShape rate1 rate2 rate3 rate4 Q1 Q2 Q3
L10061 411 1 2 0.0000280 Inf F81 1 1 NA NA NA 1 NA NA
L1096075 388 7 8 0.0002340 6.0000000 K80 1 1 NA NA NA 1 12.02642 NA
L11008 472 2 6 0.0000000 1.0000000 JC 1 1 NA NA NA 1 NA NA
L11074 468 2 3 0.0000491 0.0000000 JC 1 1 NA NA NA 1 NA NA
L11315 499 4 5 0.0001620 1.0000000 F81 1 1 NA NA NA 1 NA NA
L11796 516 1 2 0.0000239 Inf F81 1 1 NA NA NA 1 NA NA
L1180 496 5 6 0.0001581 1.5000000 F81 1 1 NA NA NA 1 NA NA
L12220 337 2 4 0.0000346 Inf F81 1 1 NA NA NA 1 NA NA
L1257 512 3 9 0.0000000 0.5000000 JC 1 1 NA NA NA 1 NA NA
L12728 618 1 2 0.0000180 0.0000000 JC 1 1 NA NA NA 1 NA NA
L13246 387 2 3 0.0001455 0.0000000 JC 1 1 NA NA NA 1 NA NA
L1334 394 3 6 0.0000287 2.0000000 JC 1 1 NA NA NA 1 NA NA
L13955 500 4 5 0.0000460 0.3333333 F81 1 1 NA NA NA 1 NA NA
L14063 726 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L14455 479 2 3 0.0000491 1.0000000 JC 1 1 NA NA NA 1 NA NA
L14518 338 2 2 0.0000688 1.0000000 F81 1 1 NA NA NA 1 NA NA
L14522 382 2 4 0.0000302 Inf F81 1 1 NA NA NA 1 NA NA
L14706 316 4 5 0.0001836 3.0000000 JC 1 1 NA NA NA 1 NA NA
L14759 659 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L14943 478 2 3 0.0000481 1.0000000 JC 1 1 NA NA NA 1 NA NA
L1558 446 5 4 0.0001437 1.5000000 JC 1 1 NA NA NA 1 NA NA
L15879 353 11 10 0.0003399 0.5714286 F81 1 1 NA NA NA 1 NA NA
L16369 397 1 2 0.0000303 0.0000000 F81 1 1 NA NA NA 1 NA NA
L17180 729 3 4 0.0000473 0.5000000 JC 1 1 NA NA NA 1 NA NA
L1747065 269 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L18283 413 2 4 0.0000279 1.0000000 F81 1 1 NA NA NA 1 NA NA
L18719 463 5 7 0.0001285 1.5000000 F81 1 1 NA NA NA 1 NA NA
L1875 741 1 3 0.0000000 Inf F81 1 1 NA NA NA 1 NA NA
L19311 701 1 3 0.0000000 0.0000000 F81 1 1 NA NA NA 1 NA NA
L19853 370 1 3 0.0000000 0.0000000 F81 1 1 NA NA NA 1 NA NA
L19963 725 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L2058296 470 1 2 0.0000253 Inf F81 1 1 NA NA NA 1 NA NA
L21436 335 5 7 0.0001023 1.5000000 F81 1 1 NA NA NA 1 NA NA
L21548 261 1 2 0.0000435 0.0000000 JC 1 1 NA NA NA 1 NA NA
L2158295 433 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L23288 328 4 6 0.0001452 3.0000000 JC 1 1 NA NA NA 1 NA NA
L2388 313 2 3 0.0000770 1.0000000 JC 1 1 NA NA NA 1 NA NA
L24817 314 4 6 0.0001114 3.0000000 F81 1 1 NA NA NA 1 NA NA
L25264 678 2 3 0.0000335 1.0000000 F81 1 1 NA NA NA 1 NA NA
L2530346 455 2 4 0.0000250 1.0000000 F81 1 1 NA NA NA 1 NA NA
L25318 338 4 5 0.0002392 1.0000000 JC 1 1 NA NA NA 1 NA NA
L25398 462 7 11 0.0000762 1.3333333 F81 1 1 NA NA NA 1 NA NA
L2555271 784 2 2 0.0000304 1.0000000 F81 1 1 NA NA NA 1 NA NA
L25605 325 2 3 0.0000691 1.0000000 F81 1 1 NA NA NA 1 NA NA
L25623 489 4 5 0.0002795 1.0000000 JC 1 1 NA NA NA 1 NA NA
L2583 370 1 2 0.0000307 Inf F81 1 1 NA NA NA 1 NA NA
L26422 316 2 3 0.0000745 1.0000000 JC 1 1 NA NA NA 1 NA NA
L26429 505 1 2 0.0000236 Inf F81 1 1 NA NA NA 1 NA NA
L26556 437 1 2 0.0000272 0.0000000 JC 1 1 NA NA NA 1 NA NA
L26579 264 2 3 0.0001301 Inf JC 1 1 NA NA NA 1 NA NA
L26712 340 1 2 0.0000346 0.0000000 JC 1 1 NA NA NA 1 NA NA
L26969 345 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L27012 369 4 5 0.0003109 0.3333333 JC 1 1 NA NA NA 1 NA NA
L2717 433 8 9 0.0002359 0.6000000 JC 1 1 NA NA NA 1 NA NA
L27287 273 7 7 0.0002199 1.3333333 F81 1 1 NA NA NA 1 NA NA
L27423 469 3 5 0.0000000 0.5000000 JC 1 1 NA NA NA 1 NA NA
L27522 275 2 3 0.0000834 Inf F81 1 1 NA NA NA 1 NA NA
L30368 409 2 6 0.0000000 1.0000000 JC 1 1 NA NA NA 1 NA NA
L31865 471 2 3 0.0001252 Inf F81 1 1 NA NA NA 1 NA NA
L32562 379 1 2 0.0000307 0.0000000 F81 1 1 NA NA NA 1 NA NA
L327 621 3 6 0.0000184 0.5000000 F81 1 1 NA NA NA 1 NA NA
L33047 473 4 6 0.0000739 1.0000000 JC 1 1 NA NA NA 1 NA NA
L33831 579 8 15 0.0001168 1.0000000 JC 1 1 NA NA NA 1 NA NA
L34090 302 4 6 0.0001541 3.0000000 JC 1 1 NA NA NA 1 NA NA
L34388 634 5 6 0.0001086 4.0000000 JC 1 1 NA NA NA 1 NA NA
L34881 782 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L35043 454 1 2 0.0000253 Inf JC 1 1 NA NA NA 1 NA NA
L35059 541 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L35068 411 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L35167 534 2 4 0.0000208 1.0000000 F81 1 1 NA NA NA 1 NA NA
L35665 349 4 5 0.0002433 3.0000000 F81 1 1 NA NA NA 1 NA NA
L35833 399 11 15 0.0002354 0.5714286 JC 1 1 NA NA NA 1 NA NA
L3586 242 2 4 0.0000461 1.0000000 JC 1 1 NA NA NA 1 NA NA
L37981 505 2 3 0.0000665 Inf JC 1 1 NA NA NA 1 NA NA
L38520 398 1 2 0.0000306 0.0000000 JC 1 1 NA NA NA 1 NA NA
L38950 308 4 6 0.0001428 Inf K80 1 1 NA NA NA 1 22026.46579 NA
L3961 469 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L39704 338 1 2 0.0000336 0.0000000 JC 1 1 NA NA NA 1 NA NA
L40411 429 1 2 0.0000274 Inf JC 1 1 NA NA NA 1 NA NA
L40464 433 4 5 0.0001038 1.0000000 JC 1 1 NA NA NA 1 NA NA
L41423 395 5 9 0.0002024 4.0000000 JC 1 1 NA NA NA 1 NA NA
L41848 376 7 8 0.0002529 6.0000000 K80 1 1 NA NA NA 1 12.02727 NA
L4304 473 2 3 0.0000516 1.0000000 JC 1 1 NA NA NA 1 NA NA
L43331 524 3 6 0.0000215 Inf HKY 1 1 NA NA NA 1 22026.46579 NA
L43460 427 10 9 0.0003668 1.5000000 JC 1 1 NA NA NA 1 NA NA
L44102 650 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L4434 428 1 2 0.0000285 0.0000000 F81 1 1 NA NA NA 1 NA NA
L4438 419 8 11 0.0002329 1.6666667 JC 1 1 NA NA NA 1 NA NA
L4442 194 2 3 0.0001213 0.0000000 F81 1 1 NA NA NA 1 NA NA
L44615 699 2 4 0.0000167 1.0000000 JC 1 1 NA NA NA 1 NA NA
L44617 621 1 3 0.0000000 0.0000000 JC 1 1 NA NA NA 1 NA NA
L44756 515 1 2 0.0000213 Inf F81 1 1 NA NA NA 1 NA NA
L44796 494 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L45231 502 1 2 0.0000221 Inf JC 1 1 NA NA NA 1 NA NA
L45477 305 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L45671 382 6 8 0.0001508 2.0000000 JC 1 1 NA NA NA 1 NA NA
L45707 338 4 4 0.0001345 1.0000000 JC 1 1 NA NA NA 1 NA NA
L46674 572 3 4 0.0000632 0.5000000 F81 1 1 NA NA NA 1 NA NA
L47664 324 2 4 0.0000356 0.0000000 JC 1 1 NA NA NA 1 NA NA
L49386 439 6 7 0.0001830 5.0000000 K80 1 1 NA NA NA 1 10.01885 NA
L4950 596 0 1 0.0000000 NA F81 1 1 NA NA NA 1 NA NA
L5059 337 1 2 0.0000341 Inf F81 1 1 NA NA NA 1 NA NA
L50690 374 1 2 0.0000326 0.0000000 JC 1 1 NA NA NA 1 NA NA
L60865 487 11 13 0.0002451 1.7500000 F81 1 1 NA NA NA 1 NA NA
L754773 366 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L8862 317 3 3 0.0001534 Inf K80 1 1 NA NA NA 1 22026.46579 NA
L905 399 0 1 0.0000000 NA JC 1 1 NA NA NA 1 NA NA
L9578 333 1 2 0.0000337 Inf F81 1 1 NA NA NA 1 NA NA
L9984 347 3 4 0.0000961 2.0000000 JC 1 1 NA NA NA 1 NA NA
# write a space delimited textfile that can be added to the migrate data file
# following Peter's suggestion here:
#https://groups.google.com/g/migrate-support/c/XjV_4jZW4RI/m/HbRWoGY6AwAJ
migrate_mutation_models <- tibble(prefix = "#$",
                                 locus=1:length(stats$locus),
                                 type = "s",
                                 model = stats$model,
                                 q1 = stats$Q2,
                                 q2 = stats$Q3)

#write_delim(migrate_mutation_models,"./migrate/run4_modelblock.txt", na="", delim = " ")

Read them back in so we don’t have to recalculate them every time I knit.

stats <- read_csv("migrate/run2/run2_locus_statistics.csv")
## Rows: 111 Columns: 11── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): locus
## dbl (10): length, segSites, nHaps, nucDiv, ttRatio, gammaShape, rate1, rate2...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(stats)
locus length segSites nHaps nucDiv ttRatio gammaShape rate1 rate2 rate3 rate4
L327 621 3 6 0.0000184 0.5000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L878 421 1 3 0.0000000 Inf 999.9859084 0.9600946 0.9894494 1.0099791 1.040477
L905 399 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L1180 496 5 6 0.0001581 1.5000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L1201 384 1 3 0.0000000 Inf 999.9955550 0.9600948 0.9894494 1.0099791 1.040477
L1257 512 3 9 0.0000000 0.5000000 999.9955549 0.9600948 0.9894494 1.0099791 1.040477
L1334 394 3 6 0.0000287 2.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L1558 446 5 4 0.0001437 1.5000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L1875 741 1 3 0.0000000 Inf 999.9932484 0.9600948 0.9894494 1.0099791 1.040477
L2388 313 2 3 0.0000770 1.0000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L2583 370 1 2 0.0000307 Inf 999.9465989 0.9600939 0.9894491 1.0099793 1.040478
L2717 433 8 9 0.0002359 0.6000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L3586 242 2 4 0.0000461 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L3961 469 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L4304 473 2 3 0.0000516 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L4434 428 1 2 0.0000285 0.0000000 999.9918294 0.9600948 0.9894494 1.0099791 1.040477
L4438 419 8 11 0.0002329 1.6666667 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L4442 194 2 3 0.0001213 0.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L4950 596 0 1 0.0000000 NA 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L5059 337 1 2 0.0000341 Inf 999.9922066 0.9600948 0.9894494 1.0099791 1.040477
L8862 317 3 3 0.0001534 Inf 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L9578 333 1 2 0.0000337 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L9984 347 3 4 0.0000961 2.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L10061 411 1 2 0.0000280 Inf 999.8864506 0.9600927 0.9894488 1.0099796 1.040479
L11008 472 2 6 0.0000000 1.0000000 999.9955550 0.9600948 0.9894494 1.0099791 1.040477
L11074 468 2 3 0.0000491 0.0000000 999.9955550 0.9600948 0.9894494 1.0099791 1.040477
L11315 499 4 5 0.0001620 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L11796 516 1 2 0.0000239 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L12220 337 2 4 0.0000346 Inf 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L12728 618 1 2 0.0000180 0.0000000 999.9896640 0.9600947 0.9894494 1.0099791 1.040477
L13246 387 2 3 0.0001455 0.0000000 999.9955549 0.9600948 0.9894494 1.0099791 1.040477
L13955 500 4 5 0.0000460 0.3333333 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L14063 726 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L14455 479 2 3 0.0000491 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L14518 338 2 2 0.0000688 1.0000000 999.9731827 0.9600944 0.9894493 1.0099792 1.040477
L14522 382 2 4 0.0000302 Inf 999.9955549 0.9600948 0.9894494 1.0099791 1.040477
L14706 316 4 5 0.0001836 3.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L14759 659 0 1 0.0000000 NA 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L14943 478 2 3 0.0000481 1.0000000 999.9960667 0.9600949 0.9894494 1.0099791 1.040477
L15879 353 11 10 0.0003399 0.5714286 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L16369 397 1 2 0.0000303 0.0000000 999.9956811 0.9600948 0.9894494 1.0099791 1.040477
L17180 729 3 4 0.0000473 0.5000000 999.9955550 0.9600948 0.9894494 1.0099791 1.040477
L18283 413 2 4 0.0000279 1.0000000 999.9942760 0.9600948 0.9894494 1.0099791 1.040477
L18719 463 5 7 0.0001285 1.5000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L19311 701 1 3 0.0000000 0.0000000 999.9955536 0.9600948 0.9894494 1.0099791 1.040477
L19853 370 1 3 0.0000000 0.0000000 999.9952890 0.9600948 0.9894494 1.0099791 1.040477
L19963 725 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L21436 335 5 7 0.0001023 1.5000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L21548 261 1 2 0.0000435 0.0000000 999.9955552 0.9600948 0.9894494 1.0099791 1.040477
L23288 328 4 6 0.0001452 3.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L24817 314 4 6 0.0001114 3.0000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L25264 678 2 3 0.0000335 1.0000000 999.9955551 0.9600948 0.9894494 1.0099791 1.040477
L25318 338 4 5 0.0002392 1.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L25398 462 7 11 0.0000762 1.3333333 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L25605 325 2 3 0.0000691 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L25623 489 4 5 0.0002795 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L26422 316 2 3 0.0000745 1.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L26429 505 1 2 0.0000236 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L26556 437 1 2 0.0000272 0.0000000 999.9873894 0.9600947 0.9894494 1.0099791 1.040477
L26579 264 2 3 0.0001301 Inf 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L26712 340 1 2 0.0000346 0.0000000 999.9829462 0.9600946 0.9894493 1.0099791 1.040477
L26969 345 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L27012 369 4 5 0.0003109 0.3333333 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L27287 273 7 7 0.0002199 1.3333333 999.9955545 0.9600948 0.9894494 1.0099791 1.040477
L27423 469 3 5 0.0000000 0.5000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L27522 275 2 3 0.0000834 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L30368 409 2 6 0.0000000 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L31865 471 2 3 0.0001252 Inf 999.9748906 0.9600944 0.9894493 1.0099792 1.040477
L32562 379 1 2 0.0000307 0.0000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L33047 473 4 6 0.0000739 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L33831 579 8 15 0.0001168 1.0000000 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L34090 302 4 6 0.0001541 3.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L34388 634 5 6 0.0001086 4.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L34881 782 0 1 0.0000000 NA 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L35043 454 1 2 0.0000253 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L35059 541 0 1 0.0000000 NA 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L35068 411 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L35167 534 2 4 0.0000208 1.0000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L35665 349 4 5 0.0002433 3.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L35833 399 11 15 0.0002354 0.5714286 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L37981 505 2 3 0.0000665 Inf 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L38520 398 1 2 0.0000306 0.0000000 999.9762472 0.9600945 0.9894493 1.0099792 1.040477
L38950 308 4 6 0.0001428 Inf 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L39704 338 1 2 0.0000336 0.0000000 999.9873889 0.9600947 0.9894494 1.0099791 1.040477
L40411 429 1 2 0.0000274 Inf 999.9759459 0.9600945 0.9894493 1.0099792 1.040477
L40464 433 4 5 0.0001038 1.0000000 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L41423 395 5 9 0.0002024 4.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L41848 376 7 8 0.0002529 6.0000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L43331 524 3 6 0.0000215 Inf 999.9955548 0.9600948 0.9894494 1.0099791 1.040477
L43460 427 10 9 0.0003668 1.5000000 999.9955545 0.9600948 0.9894494 1.0099791 1.040477
L44102 650 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L44615 699 2 4 0.0000167 1.0000000 999.9955549 0.9600948 0.9894494 1.0099791 1.040477
L44617 621 1 3 0.0000000 0.0000000 999.9718997 0.9600944 0.9894493 1.0099792 1.040477
L44756 515 1 2 0.0000213 Inf 999.9510658 0.9600940 0.9894492 1.0099793 1.040478
L44796 494 0 1 0.0000000 NA 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L45231 502 1 2 0.0000221 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L45477 305 0 1 0.0000000 NA 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L45671 382 6 8 0.0001508 2.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L45707 338 4 4 0.0001345 1.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L46674 572 3 4 0.0000632 0.5000000 999.9955550 0.9600948 0.9894494 1.0099791 1.040477
L47664 324 2 4 0.0000356 0.0000000 999.9955549 0.9600948 0.9894494 1.0099791 1.040477
L49386 439 6 7 0.0001830 5.0000000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L50690 374 1 2 0.0000326 0.0000000 999.9785477 0.9600945 0.9894493 1.0099791 1.040477
L60865 487 11 13 0.0002451 1.7500000 999.9955546 0.9600948 0.9894494 1.0099791 1.040477
L754773 366 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L1096075 388 7 8 0.0002340 6.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L1747065 269 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L2058296 470 1 2 0.0000253 Inf 0.1059603 0.0000011 0.0015905 0.1101281 3.888280
L2158295 433 0 1 0.0000000 NA 1.0000000 0.1369538 0.4767519 1.0000000 2.386294
L2530346 455 2 4 0.0000250 1.0000000 999.9955547 0.9600948 0.9894494 1.0099791 1.040477
L2555271 784 2 2 0.0000304 1.0000000 0.1059603 0.0000011 0.0015905 0.1101281 3.888280

So we have a 16 invariant loci, and the mean overall transition:transversion ratio is 0.7927928. Mean gamma shape parameter is 765.8643026, which argues for only one rate.

Parmfile

I went through and compared my 3.x parmfile line by line to the default one generated by Migrate 4.4.4, and came up with this

################################################################################
# Parmfile for Migrate 4.4.4(git:v4-series-26-ge85c6ff)-June-1-2019 [do not remove these first TWO lines]
# generated automatically on
# Sat Jan 15 14:10:36 2022
#
# please report problems to Peter Beerli
#  email: beerli@fsu.edu
#  http://popgen.sc.fsu.edu/migrate.html
menu=YES
nmlength=10
datatype=SequenceData
datamodel=HKY
ttratio=1.000000

freqs-from-data=YES 

# data were filtered to Q30, so this seems like a good seqerror
seqerror-rate={0.0001,0.0001,0.0001,0.0001}
categories=1 #no categories file specified
rates=1: 1.000000 
prob-rates=1: 1.000000 
autocorrelation=NO
weights=NO
recover=NO
fast-likelihood=NO
inheritance-scalars={1.00000000000000000000}
haplotyping=YES:no-report
population-relabel={1 2 3 4 5 6 7 8 9 10}
infile=../../ptuberculosa.mig
random-seed=AUTO #OWN:410568459
title= Palythoa tuberculosa - Hawaii

progress=YES

logfile=NO

print-data=NO

outfile=outfile.txt

pdf-outfile=outfile.pdf

pdf-terse=NO

use-M=YES

print-tree=NONE

mig-histogram=NO
skyline=NO #needs mig-histogram=ALL:...

theta=PRIOR:50
migration=PRIOR:10
rate=PRIOR:50
split=PRIOR:10
splitstd=PRIOR:10

mutation=CONSTANT

analyze-loci=A

divergence-distrib=S

#standard stepping-stone model for starters
custom-migration={
**00000000
***0000000
0***000000
00***00000
000***0000
0000***000
00000***00
000000***0
0000000***
00000000**
}

geo=NO

updatefreq=0.200000 0.200000 0.200000 0.200000 #tree, parameter haplotype, timeparam updates
bayes-posteriorbins= 500 500
bayes-posteriormaxtype=TOTAL
bayes-file=YES:bayesfile
bayes-allfile=YES:bayesallfile
bayes-all-posteriors=YES:bayesallposterior
bayes-proposals= THETA METROPOLIS-HASTINGS Sampler
bayes-proposals= MIG SLICE Sampler
bayes-proposals= DIVERGENCE METROPOLIS-HASTINGS Sampler
bayes-proposals= DIVERGENCESTD METROPOLIS-HASTINGS Sampler
bayes-priors= THETA WEXPPRIOR: 0.0 0.01 0.1000000 0.01000 
bayes-priors= MIG WEXPPRIOR: 0.000100 100000.000000 1000000
bayes-priors= RATE * * UNIFORMPRIOR: 0.000000 10000000000.000000 1000000000.000000 
bayes-hyperpriors=NO
#
long-chains=1
long-inc=100
long-sample=10000
burn-in=2000  
auto-tune=YES:0.440000
assign=NO

heating=YES:1:{1.000000,1.500000,3.000000,1000000.000000}
heated-swap=YES

moving-steps=NO

gelman-convergence=No

replicate=NO

end

Test Run 1

Let’s see what happens! Started this run on Jan 22, at 10pm.

screen -S migrate_testrun

mpirun -np 32 ~/migrate-4.4.4/src/migrate-n-mpi parmfile

And it was finished the next morning! Prior on m was too wide, giving results like this:

Results

Wide Priors on m No bueno.

Test Run 2

Revise m prior

Revised the m prior like so (I had no window value in the original!) Order of values is min, mean, maximum, proposal window.

bayes-priors= MIG WEXPPRIOR: 0.000100 1000.000000 10000 100

Panmixia

A few modifications to make a panmictic model

population-relabel={1 1 1 1 1 1 1 1 1 1}
#and
custom-migration={
*
}

Island Model

Change all parameter values to m for island model.

custom-migration={
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
mmmmmmmmmm
}

Minor github issue. Got a little hung up at this point because I accidentally committed some bayesallfiles that were hundreds of Mb, and of course github wouldn’t let me upload those. Thought I fixed it with these instructions. Then ended up trying git-filter-repo (installed via homebrew), then ended up restoring the whole thing from the github repository. Ouch. Then it wouldn’t push until I ran git prune

Results

Model Selection

Very nice! 😃 Going to need to modify my code to read multilocus output from migrate-n.

workingDir <- "./migrate/run2"

modelMarglikes <- harvest.model.likelihoods(workingDir = workingDir)
## [1] "./migrate/run2/island"
## [1] "./migrate/run2/panmixia"
## [1] "./migrate/run2/stepping.stone"
kable(modelMarglikes, format.args = list(digits = 5))
model thermodynamic bezier.corrected harmonic
island -71972 -71120 -71813
panmixia -70400 -69492 -69655
stepping.stone -65995 -65136 -65790
results <- bfcalcs(modelMarglikes)

kable(results)
model thermodynamic bezier.corrected harmonic lbf choice modelprob
island -71971.81 -71120.19 -71812.53 -11967.82 3 0
panmixia -70400.30 -69492.18 -69654.60 -8711.80 2 0
stepping.stone -65995.43 -65136.28 -65789.54 0.00 1 1

Parameters

workingDir2 <- "/Volumes/GoogleDrive-103325533945714033178/My Drive/Ptuberculosa/stacks_output/migrate/testrun/run2"

winningModelDir <- file.path(workingDir2, "stepping.stone")

#this may take a minute or two
data <- read.table(file.path(winningModelDir,"bayesallfile"), header=T) 

#calculate migrants per generation
data.nm <- migrants.per.gen(data)

data %>% group_by(locus) %>% summarize()
# Split the whole list into the individual replicates, so that you will
# have a list of data frames
#data.list.1 <- split(data.nm,data$Replicate)

#split the data on locus
data.list.1 <- split(data.nm,data$Locus)
data.list.mcmc <- mcmc.list(lapply(data.list.1,mcmc))   
# Remove burnin if it wasn't removed in the run, where burnin is a proportion of the total.
# e.g. First 40% First 20,000 samples out of 50,000 or 10 million steps out of 25 million
#burnin <- 0.2
#data.list.1<-lapply(data.list.1,subset,subset=Steps>(burnin*max(data.list.1[[1]]$Steps)))

data.list.mcmc <- mcmc.list(data.list.1)
    
#convert each dataframe to an mcmc object and then convert the whole thing to an mcmc list

#collapse all replicates to one for purposes of highest posterior density
data.list.allinone <- mcmc(data=do.call("rbind",data.list.2)) 
    

summ <- summary(data.list.mcmc)
ess <- effectiveSize(data.list.mcmc)
gelman <- gelman.diag(data.list.mcmc,multivariate=F)
HPD <- HPDinterval(data.list.allinone)
    
#concatenate the stats
allstats<-cbind(summ$statistics[,1:2],HPD,ess,gelman$psrf)
#write.csv(allstats,file="codastats.csv")

First Analysis Run

’Ale’a and I settled on the following models

Models


#Panmixia (panmixia; 1 panmictic population)                        
population-relabel={1 1 1 1 1 1 1 1 1 1}                                
custom-migration=   *                           
                                
                                
                                
                                
                                
#n-Island (island; all populations same size, all exchange migrants)                                
population-relabel={1 2 3 4 5 6 7 8 9}                              
custom-migration=   
  Pop_Kure                  m   m   m   m   m   m   m 
    Pop_P&H                   m m   m   m   m   m   m
    Pop_MaroReef                m   m   m   m   m   m   m
    Pop_FFS                   m m   m   m   m   m   m
    Pop_Kauai                   m   m   m   m   m   m   m
    Pop_Oahu                    m   m   m   m   m   m   m
    Pop_Molokai               m m   m   m   m   m   m
    Pop_Maui                    m   m   m   m   m   m   m
    Pop_BigIsland               m   m   m   m   m   m   m
                                
                                
                                
                                
                                
                                
#Stepping-Stone (stepping.stone)                
population-relabel={1 2 3 4 5 6 7 8 9}                              
    Pop_Kure                    *   *   0   0   0   0   0   0   0
    Pop_P&H                     *   *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                     0   0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    0   0   0   0   *   *   *   0   0
    Pop_Molokai                 0   0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *
                                
                                
                                
                                
                                
#Stepping-Stone with a Kure-Oahu connection (stepping.stone.KO)                             
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration=   
    Pop_Kure                    *   *   0   0   0   0   *   0   0
    Pop_P&H                     *   *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                     0   0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    *   0   0   0   *   *   *   0   0
    Pop_Molokai                 0   0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *
                                
                                
                                
                                
#NWHI vs MHI    (NWHI_MHI; two panmictic regions)                   
population-relabel={1 1 1 1 2 2 2 2 2}                              
custom-migration=   
  Pop_NWHI                  *   *                   
    Pop_MHI                   * *                   
                                
                                
                                
                                
# Stepping Stone Breaks (stepping.stones.breaks; 
#  regional groups of Kure_PH vs MR_FFS vs MHI as lumped populations 
#  with stepping stone gene flow between them. )                                
population-relabel={1 1 2 2 3 4 5 6 7}                              
custom-migration=   
  Pop_Kure_PH                   *   *   0   0   0   0   0
    Pop_MaroReef_FFS                *   *   *   0   0   0   0
    Pop_Kauai                       0   *   *   *   0   0   0
    Pop_Oahu                        0   0   *   *   *   0   0
    Pop_Molokai                   0 0   0   *   *   *   0
    Pop_Maui                        0   0   0   0   *   *   *
    Pop_BigIsland                   0   0   0   0   0   *   *
                                
                                
                                
# Stepping Stone Breaks (stepping.stones.breaks; 
#  regional groups of Kure_PH vs MR_FFS vs MHI as lumped populations 
#  with stepping stone gene flow between them, and a Kure-Oahu connection.)                                 
population-relabel={1 1 2 2 3 4 5 6 7}                              
custom-migration=   
  Pop_Kure_PH                   *   *   0   *   0   0   0
    Pop_MaroReef_FFS                *   *   *   0   0   0   0
    Pop_Kauai                       0   *   *   *   0   0   0
    Pop_Oahu                        *   0   *   *   *   0   0
    Pop_Molokai                   0 0   0   *   *   *   0
    Pop_Maui                        0   0   0   0   *   *   *
    Pop_BigIsland                   0   0   0   0   0   *   *
      

I also turned menu = NO, and replicate=YES:3.

Bash Script

So we will do 10 replicates of 3 replicates. This will start at r1, and run all models for that before moving on. Pretty sure this will finish one whole model before moving on to the next one (since all threads are being used for different loci)

#!
for r in */
    do
        cd $r
        echo $r
        date
        date > date.txt
            for m in */
              do
                cd $m
                  date > date.txt
                  echo $m
                  date
                  mpirun -np 32 ~/migrate-4.4.4/src/migrate-n-mpi parmfile
                  sleep 1
                cd ..
              done
        cd ..
    done

Copy It Up

Copy it up, and then make a total of 10 replicate copies of the folder.

scp -r ./run4 ecrandall@nautilus.psu.edu:./Ptuberculosa/run4/r1    

#once on server, copy it 9 times
for a in $(seq 2 10); do cp -r rep1 rep$a; done

And back down

And ten replicates finished in about 10 days! Found this guide to download everything except the massive bayesallfiles and pdfs

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf"  ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4 ./

rsync -av -e ssh --exclude='bayes*'  ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4/rep1/stepping.stone.all ./stepping.stone.KO.all

Results

Model Marginal Likelihoods

runDir <- "/Volumes/GoogleDrive/My Drive/ptuberculosa/migrate/run4"
#runDir <-"/Volumes/GoogleDrive-103325533945714033178/My Drive/ptuberculosa/migrate/run4"

likelist <- list()
for(r in 1:10){
  rep = paste0("rep",r)
  print(rep)
  likelist[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir,rep))
}

# Model selection for each replicate...
likelist %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 10 replicates:

like.df <-  likelist %>% bind_rows() %>% group_by(model)

means <- like.df %>% summarize(bezier.corrected = mean(bezier.corrected))
  

final_model <- bfcalcs(means)

final_model
## [1] model            bezier.corrected lbf              choice          
## [5] modelprob       
## <0 rows> (or 0-length row.names)

T-Test

Difference between stepping.stone and stepping.stone.KO is very significant. 😄

top.choice <- final_model$model[which(final_model$choice ==1)]
second.choice <- final_model$model[which(final_model$choice ==2)]

permTS(like.df$bezier.corrected[which(like.df$model == top.choice)],
       like.df$bezier.corrected[which(like.df$model == second.choice)],
       alternative = "greater", method = "exact.ce")

Gene Flow between Oahu and Kure

There is an intriguing result of nearly one-way gene flow from Oahu to Kure. Also, the stepping.stone.breaks.KO is much better than just stepping.stone.breaks. This suggests that a divergence parameter for these two populations is in order.

Gene Flow from Oahu to Kure

Gene Flow from Kure to Oahu

What about one way gene flow?

Stepping.stone.KO.oneway

#Stepping-Stone with a *one way* Kure-Oahu ongoing gene flow connection 
# (stepping.stone.KO.oneway)                                
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration={
*   *   0   0   0   *   0   0   0
*   *   *   0   0   0   0   0   0
0   *   *   *   0   0   0   0   0
0   0   *   *   *   0   0   0   0
0   0   0   *   *   *   0   0   0
*   0   0   0   *   *   *   0   0
0   0   0   0   0   *   *   *   0
0   0   0   0   0   0   *   *   *
0   0   0   0   0   0   0   *   *
}

More Replicates

Going to add more replicates of stepping.stone.KO and stepping.stone.KO.oneway

for a in $(seq 12 30); do cp -r rep11 rep$a; done

Copy It Down

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf"  ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4.2 ./   

Results

Model Marginal Likelihoods

runDir <- "/Volumes/GoogleDrive/My Drive/ptuberculosa/migrate/run4"
#runDir <-   "/Volumes/GoogleDrive-103325533945714033178/My Drive/ptuberculosa/migrate/run4"

likelist <- list()
for(r in 1:30){
  rep = paste0("rep",r)
  print(rep)
  likelist[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir,rep))
}

# Model selection for each replicate...
#likelist %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 30 replicates:

like.df <-  likelist %>% bind_rows() %>% group_by(model)

means <- like.df %>% summarize(bezier.corrected = mean(bezier.corrected))
  

final_model <- bfcalcs(means)

final_model
## [1] model            bezier.corrected lbf              choice          
## [5] modelprob       
## <0 rows> (or 0-length row.names)

This time stepping stone with two-way Kure-Oahu connection beats out the one way connection

Adding divergence models

Peter says you can mix * with d parameters, and I tested it and it worked! Based on these results I’m going to add a model of one way ongoing migration from Oahu to Kure, and another model of divergence from Oahu to Kure about 80 years ago with no subsequent gene flow, and finally, divergence plus migration between Oahu to Kure.

stepping.stone.KO.D

Equilibrium Gene Flow, but divergence with gene flow from Oahu to Kure


#Stepping-Stone with a *one way* Kure-Oahu divergence with gene flow 
# (stepping.stone.KO.divmig)                                
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration=   
    Pop_Kure                    *   0   0   0   0   0   D   0   0
    Pop_P&H                   0 *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                   0 0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    0   0   0   0   *   *   *   0   0
    Pop_Molokai               0 0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *

stepping.stone.KOd

Equilibrium Gene Flow, but divergence without gene flow from Oahu to Kure

#Stepping-Stone with a *one way* Kure-Oahu divergence without gene flow 
# (stepping.stone.KO.div)   
population-relabel={1 2 3 4 5 1 7 8 9}                              
custom-migration=   
    Pop_Kure                    *   0   0   0   0   0   d   0   0
    Pop_P&H                   0 *   0   0   0   0   0   0   0
    Pop_MaroReef                0   *   *   *   0   0   0   0   0
    Pop_FFS                   0 0   *   *   *   0   0   0   0
    Pop_Kauai                   0   0   0   *   *   *   0   0   0
    Pop_Oahu                    0   0   0   0   *   *   *   0   0
    Pop_Molokai               0 0   0   0   0   *   *   *   0
    Pop_Maui                    0   0   0   0   0   0   *   *   *
    Pop_BigIsland               0   0   0   0   0   0   0   *   *

stepping.stone.splitsD

North to South Divergence with migration

*   0   0   0   0   0   0   0   0
D   *   0   0   0   0   0   0   0
0   D   *   0   0   0   0   0   0
0   0   D   *   0   0   0   0   0
0   0   0   D   *   0   0   0   0
0   0   0   0   D   *   0   0   0
0   0   0   0   0   D   *   0   0
0   0   0   0   0   0   D   *   0
0   0   0   0   0   0   0   D   *

stepping.stone.splits.d

North to South Divergence without migration

*   0   0   0   0   0   0   0   0
d   *   0   0   0   0   0   0   0
0   d   *   0   0   0   0   0   0
0   0   d   *   0   0   0   0   0
0   0   0   d   *   0   0   0   0
0   0   0   0   d   *   0   0   0
0   0   0   0   0   d   *   0   0
0   0   0   0   0   0   d   *   0
0   0   0   0   0   0   0   d   *

stepping.stone.splitsD.KOsplitD

Here, population 1 isn’t born until it splits from population 6. I’m having trouble setting an individual prior for a very young divergence from 6 to 1- apparently I found a bug- Peter is fixing it.

*   0   0   0   0   d   0   0   0
0   *   0   0   0   0   0   0   0
0   D   *   0   0   0   0   0   0
0   0   D   *   0   0   0   0   0
0   0   0   D   *   0   0   0   0
0   0   0   0   D   *   0   0   0
0   0   0   0   0   D   *   0   0
0   0   0   0   0   0   D   *   0
0   0   0   0   0   0   0   D   *

Priors

And we need to add reasonable priors on divergence time to match the foundation of Naval Air Station Midway in 1941, with a standard deviation of say 20 years because it was placed under Navy control in 1903. Splitting times \(\tau\) in Migrate are in units of \(\mu\) x generations. I will assume a nuclear mutation rate of 1e-9 and a generation time of 1 year. Peter confirmed

#take the mean of the modes from one replicate of the stepping.stone.KO model
#meantheta <- mean(0.00570+0.00450+0.00450+0.00490+0.00630+0.00510+0.00570+0.00510+0.00590)
mu <- 1e-9
generation_time <- 1

mean_split <- 80*mu/generation_time
std_split <- 20*mu/generation_time
max_split <- 163 * mu/generation_time

80 years ago is 8^{-8}, and a max of 163 years (Midway discovered in 1859) which is 1.63^{-7}.

It turns out that setting individual priors on divergence isn’t working right now, so I’m just going to have to set a wide prior if there are other d or D parameters and only can set the specific prior if the Kure Oahu split is the only split in the model.

This results in adding the following two lines to the parmfiles with only the Kure Oahu split.

bayes-priors= SPLIT * * WEXPPRIOR: 0.0 8e-8 1.63e-7 1.63e-8 
bayes-priors= SPLITSTD * * WEXPPRIOR: 0.0 8e-8 1.63e-7 1.63e-8

Unfortunately the prior above was resulting in error messages.

[1]    63427 illegal hardware instruction  /Applications/migrate/migrate-4.4.4/migrate-n

From the output, it looks like it can’t consider priors smaller than 1e-6. But even using 1e-5 or 1-e4 results in errors.

So all will use this wider prior. Also, I am using exponentially distributed priors, so the SPLITSTD is not going to be used.

bayes-priors= SPLIT * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000
bayes-priors= SPLITSTD * *  WEXPPRIOR: 0.0 0.001 0.1000000 0.01000

Copy It Up

Copy it up, and then make a total of 10 replicate copies of the folder.

scp -r ./run4.1 ecrandall@nautilus.psu.edu:./Ptuberculosa/run4.1    
mkdir rep1
mv step* rep1
cp ../run4/ptuberculosa.mig ./ptuberculosa.mig
cp ../run4/mpi_run_migrate-n.sh ./mpi_run_migrate-n.sh
cd rep1
#once on server, copy it 9 times
for a in $(seq 2 10); do cp -r rep1 rep$a; done

And Back down


rsync -av -e ssh --exclude='bayes*'  ecrandall@nautilus.psu.edu:~/Ptuberculosa/run4.5 ./

rsync -av -e ssh --exclude='bayes*' --exclude="*.pdf"  eric@moneta.tobolab.org:~/Palythoa/run4.6 ./run4.6

# then copied them into the run4 folder with:
# the exclude statement excludes stepping.stone.KOd and stepping.stone.KO.D which didn't complete.
rsync -av --progress run4.5/ run4/ --exclude '*KO[\.d]*'  

rsync -av --progress run4.6/ run4/

Results

Model Marginal Likelihoods

#runDir <- "/Volumes/GoogleDrive/My Drive/ptuberculosa/migrate/run4"
runDir <-"/Users/eric/Library/CloudStorage/GoogleDrive-edc5240@psu.edu/My Drive/ptuberculosa/migrate/run4"

likelist <- list()
for(r in 1:30){
  rep = paste0("rep",r)
  print(rep)
  likelist[[rep]] <- harvest.model.likelihoods(workingDir=file.path(runDir,rep))
}

# Model selection for each replicate...
likelist %>% map(bfcalcs)

The final model marginal likelihood estimates based on the mean of 10 replicates:

like.df <-  likelist %>% bind_rows() %>% group_by(model)

means <- like.df %>% summarize(bezier.corrected = mean(bezier.corrected))
  

final_model <- bfcalcs(means)

final_model
##                              model bezier.corrected       lbf choice modelprob
## 1                           island        -75311.81 -41691.66     10         0
## 2                         NWHI_MHI        -72281.30 -35630.64      9         0
## 3                         panmixia        -72172.88 -35413.80      8         0
## 4                   stepping.stone        -71256.23 -33580.50      7         0
## 5            stepping.stone.breaks        -71110.23 -33288.51      6         0
## 6         stepping.stone.breaks.KO        -70307.17 -31682.39      5         0
## 7                stepping.stone.KO        -69261.69 -29591.42      3         0
## 8              stepping.stone.KO.D        -55474.16  -2016.36      2         0
## 9         stepping.stone.KO.oneway        -69296.21 -29660.46      4         0
## 10              stepping.stone.KOd        -54465.98      0.00      1         1
## 11         stepping.stone.splits.d        -77896.08 -46860.21     13         0
## 12          stepping.stone.splitsD        -77091.94 -45251.93     12         0
## 13 stepping.stone.splitsD.KOsplitd        -76884.10 -44836.24     11         0

T-Test

The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The difference between the top two models is significant. 😆

top.choice <- final_model$model[which(final_model$choice ==1)]
second.choice <- final_model$model[which(final_model$choice ==2)]

TS <- permTS(like.df$bezier.corrected[which(like.df$model == top.choice)],
       like.df$bezier.corrected[which(like.df$model == second.choice)],
       alternative = "greater")

TS

Figures

And some figures summarizing all this. The two best models are equilibrium models across the archipelago, with recent divergence from Oahu to Kure. The best one is divergence without gene flow, followed by divergence with gene flow. The three models with divergences are much worse than those without. Difference between stepping.stone and stepping.stone.KO is very significant.

models <- c("N-Island", "NWHI vs. MHI"," Panmixia", "SS: Islands",
            "SS: Regional Breaks", "SS: Regional Breaks, KO 2way",
            "SS: Islands, OK 2way", 
            "SS: Islands, O2K Colonization with GF", 
            "SS: Islands, O2K 1 way", 
            "SS: Islands, O2K Colonization without GF",
            "Seq Div: No GF","Seq Div: GF",
            "Seq Div: GF, O2K Colonization without GF")

likesPlot <- likelist %>% bind_rows() %>% group_by(model) %>% 
              ggplot(mapping = aes(x=model, y = bezier.corrected)) +
              geom_violin(draw_quantiles = 0.5) +
              theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
              scale_x_discrete(labels = models) +
              labs(x = "Metapopulation Model", y = "Bezier Corrected Marginal Likelihood")
likesPlot

Zoom in on the top two models

likesPlot_zoom <- likesPlot + ylim(-69400, -54000) + geom_point()

likesPlot_zoom
## Warning: Removed 92 rows containing non-finite values (stat_ydensity).
## Warning: Removed 92 rows containing missing values (geom_point).

Model Parameter Estimates

Because I set such wide priors on gene flow for marine species, the standard PDF output does a poor job of binning the data, such that even though an equilibrium model of gene flow is the best model, Migrate is telling me that all migration parameters have a mode of 2.5 and 0 at the 25th percentile. It would be nice if I could look at the posterior calculated across all loci myself, but it is unclear how Migrate does this exactly. I inquired about this to Peter a year ago, and he got back to me with this method for summing the posterior over all loci and some Python code as a demo.

  1. create a histogram for each locus
  2. log the histogram (there will be issues with zero value cells) *. migrate smoothes each locus histogram
  3. subtract the log(prior)
  4. sum all log histograms for each locus
  5. add the log prior back
  6. convert to non-log
  7. adjust so that the sum of the new histogram sums t0 1.0
winningModelDir <- "/Users/eric/Library/CloudStorage/GoogleDrive-edc5240@psu.edu/My Drive/ptuberculosa/migrate/run4.6/rep1/stepping.stone.KOd"
popkey <- read_csv("popkey.csv")
## Rows: 9 Columns: 2── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Pop
## dbl (1): Index
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names(popkey$Pop) <- popkey$Index




#this may take a minute or two
posterior <- read.table(file.path(winningModelDir,"bayesallfile"), header=T) 

#calculate migrants per generation
posterior <- migrants.per.gen(posterior)



posterior_long <- posterior %>% pivot_longer(starts_with(c("Theta_", "M_", "Nm_","D_", "d_")),
                                                         names_to = "parameter", values_to = "value")
#make a new labeling key
posterior_parameters <-posterior_long$parameter[1:37]



parameter_key <- posterior_parameters %>% 
                                  str_replace("(\\d)_(\\d)", "\\1 to \\2") %>%
                                  str_replace("M_", "m/mu " ) %>% 
                                  str_replace("Theta_", "theta ") %>% 
                                  str_replace_all(popkey$Pop) %>% 
                                  str_replace("Nm_", "4Nem ") 
                                  

names(parameter_key) <- posterior_parameters

Theta plots

Here are plots of each parameter posterior, with each colored line being the posterior from one of 106 loci. Next thing I have to do is code up the summation described by Peter above.

\(\theta=4N_e\mu\)

Theta_plot <- posterior_long %>%  filter(str_detect(parameter, "Theta_")) %>%  ggplot() + 
                                   geom_density(mapping = aes(x = value, group=Locus, color =Locus), weight = 0.5) + 
                              scale_x_log10(limits=c(1e-4,0.1)) + facet_wrap(~parameter, scales = "free_y",
                                                    labeller = labeller(parameter= parameter_key), 
                                                    dir = "h",nrow=2) +
                                                    theme(axis.text.x = element_text(size = rel(0.6)))

Theta_plot
## Warning: Removed 140872 rows containing non-finite values (stat_density).

Migration plots

Proportion of migrants relative to mutation rate \(\frac{m}{\mu}\)

M_plot <- posterior_long %>%  filter(str_detect(parameter, "M_")) %>%  ggplot() + 
                                   geom_density(mapping = aes(x = value, group=Locus, color =Locus), weight = 0.5) + 
                              scale_x_log10(limits=c(1e-4,1000)) + facet_wrap(~parameter, scales = "free_y", 
                                                    labeller = labeller(parameter= parameter_key), 
                                                    dir = "v",nrow=2) +
                                                    theme(axis.text.x = element_text(size = rel(0.6)))
M_plot
## Warning: Removed 26922 rows containing non-finite values (stat_density).

Gene Flow plots

Migrants per generation \(4N_em\)

Nm_plot <- posterior_long %>%  filter(str_detect(parameter, "Nm_")) %>%  ggplot() + 
                                   geom_density(mapping = aes(x = value, group=Locus, color =Locus), weight = 0.5) + 
                              scale_x_log10(limits=c(1e-6,10)) + facet_wrap(~parameter, scales = "free_y",
                                                    labeller = labeller(parameter= parameter_key), 
                                                    dir = "v",nrow=2) +
                                                    theme(axis.text.x = element_text(size = rel(0.6)))

Nm_plot
## Warning: Removed 13128 rows containing non-finite values (stat_density).

Literature Cited